The health insurance coverage data was compiled from the US Department of Health and Human Services and the US Census Bureau.
The Affordable Care Act (ACA) is the name for the comprehensive healthcare reform law and its amendments which address health insurance coverage, healthcare costs, and preventive care. The law was enacted in two parts: The Patient Protection and Affordable Care Act was signed into law on March 23, 2010, by President Barack Obama and was amended by the Health Care and Education Reconciliation Act on March 30, 2010.
This dataset provides health insurance coverage data for each state and the nation as a whole, including variables such as the uninsured rates before and after Obamacare, estimates of individuals covered by employer and marketplace healthcare plans, and enrollment in Medicare and Medicaid programs.
The project is aimed at self-studying the impact of the Affordable Care Act.
# Package loading
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(tmap)
## Breaking News: tmap 3.x is retiring. Please test v4, e.g. with
## remotes::install_github('r-tmap/tmap')
library(tmaptools)
library(ggcorrplot)
# Data loading
df = read_csv("aca data.csv")
## Rows: 52 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): State, Uninsured Rate (2010), Uninsured Rate (2015), Uninsured Rate...
## dbl (8): Health Insurance Coverage Change (2010-2015), Employer Health Insur...
## lgl (1): State Medicaid Expansion (2016)
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df
## # A tibble: 52 × 14
## State Uninsured Rate (2010…¹ Uninsured Rate (2015…² Uninsured Rate Chang…³
## <chr> <chr> <chr> <chr>
## 1 Alabama 14.6% 10.1% -4.5%
## 2 Alaska 19.9% 14.9% -5%
## 3 Arizona 16.9% 10.8% -6.1%
## 4 Arkansas 17.5% 9.5% -8%
## 5 Califor… 18.5% 8.6% -9.9%
## 6 Colorado 15.9% 8.1% -7.8%
## 7 Connect… 9.1% 6% -3.1%
## 8 Delaware 9.7% 5.9% -3.8%
## 9 Distric… 7.6% 3.8% -3.8%
## 10 Florida 21.3% 13.3% -8%
## # ℹ 42 more rows
## # ℹ abbreviated names: ¹​`Uninsured Rate (2010)`, ²​`Uninsured Rate (2015)`,
## # ³​`Uninsured Rate Change (2010-2015)`
## # ℹ 10 more variables: `Health Insurance Coverage Change (2010-2015)` <dbl>,
## # `Employer Health Insurance Coverage (2015)` <dbl>,
## # `Marketplace Health Insurance Coverage (2016)` <dbl>,
## # `Marketplace Tax Credits (2016)` <dbl>, …
# Rename column
names(df)[names(df) == "Uninsured Rate (2010)"] <- "Unin_Rate_2010"
names(df)[names(df) == "Uninsured Rate (2015)"] <- "Unin_Rate_2015"
names(df)[names(df) == "Uninsured Rate Change (2010-2015)"] <- "Unin_Rate_Change_2010_2015"
names(df)[names(df) == "Health Insurance Coverage Change (2010-2015)"] <- "Health_In_Co_Change_2010_2015"
names(df)[names(df) == "Employer Health Insurance Coverage (2015)"] <- "Ep_Health_In_Co_2015"
names(df)[names(df) == "Marketplace Health Insurance Coverage (2016)"] <- "Mp_Health_In_Co_2016"
names(df)[names(df) == "Marketplace Tax Credits (2016)"] <- "Mp_Tax_Credits_2016"
names(df)[names(df) == "Average Monthly Tax Credit (2016)"] <- "Avg_Monthly_Tax_Credit_2016"
names(df)[names(df) == "State Medicaid Expansion (2016)"] <- "State_Medic_Exp_2016"
names(df)[names(df) == "Medicaid Enrollment (2013)"] <- "Medicd_Enrol_2013"
names(df)[names(df) == "Medicaid Enrollment (2016)"] <- "Medicd_Enrol_2016"
names(df)[names(df) == "Medicaid Enrollment Change (2013-2016)"] <- "Medicd_Enrol_2013_2016"
names(df)[names(df) == "Medicare Enrollment (2016)"] <- "Medicr_Enrol_2016"
# Factors/Measures conversion
df[, c("Unin_Rate_2010", "Unin_Rate_2015", "Unin_Rate_Change_2010_2015")] <- lapply(
df[, c("Unin_Rate_2010", "Unin_Rate_2015", "Unin_Rate_Change_2010_2015")],
function(x) as.numeric(sub("%", "", x))/100)
df$Avg_Monthly_Tax_Credit_2016 <- as.numeric(sub("\\$", "", df$Avg_Monthly_Tax_Credit_2016))
df$State_Medic_Exp_2016 <- as.numeric(df$State_Medic_Exp_2016)
# Drop the last row since it is the total of united states
df <- df[-nrow(df), ]
# Dataframe display after conversion
df
## # A tibble: 51 × 14
## State Unin_Rate_2010 Unin_Rate_2015 Unin_Rate_Change_2010_2015
## <chr> <dbl> <dbl> <dbl>
## 1 Alabama 0.146 0.101 -0.045
## 2 Alaska 0.199 0.149 -0.05
## 3 Arizona 0.169 0.108 -0.061
## 4 Arkansas 0.175 0.095 -0.08
## 5 California 0.185 0.086 -0.099
## 6 Colorado 0.159 0.081 -0.078
## 7 Connecticut 0.091 0.06 -0.031
## 8 Delaware 0.097 0.059 -0.038
## 9 District of Columbia 0.076 0.038 -0.038
## 10 Florida 0.213 0.133 -0.08
## # ℹ 41 more rows
## # ℹ 10 more variables: Health_In_Co_Change_2010_2015 <dbl>,
## # Ep_Health_In_Co_2015 <dbl>, Mp_Health_In_Co_2016 <dbl>,
## # Mp_Tax_Credits_2016 <dbl>, Avg_Monthly_Tax_Credit_2016 <dbl>,
## # State_Medic_Exp_2016 <dbl>, Medicd_Enrol_2013 <dbl>,
## # Medicd_Enrol_2016 <dbl>, Medicd_Enrol_2013_2016 <dbl>,
## # Medicr_Enrol_2016 <dbl>
# Data inspection
# Statistics function
summarize_numeric = function(dataset) {
dataset = select_if(dataset, is.numeric)
summary.table = data.frame(Attribute = names(dataset))
summary.table = summary.table %>%
mutate("Missing Values" = apply(dataset, 2, function (x) sum(is.na(x))),
"Unique Values" = apply(dataset, 2, function (x) length(unique(x))),
"Mean" = colMeans(dataset, na.rm = TRUE),
"Min" = apply(dataset, 2, function (x) min(x, na.rm = TRUE)),
"Max" = apply(dataset, 2, function (x) max(x, na.rm = TRUE)),
"SD" = apply(dataset, 2, function (x) sd(x, na.rm = TRUE))
)
summary.table
}
# Statistics display
summarize_numeric(df)
## Attribute Missing Values Unique Values Mean
## 1 Unin_Rate_2010 0 45 1.415490e-01
## 2 Unin_Rate_2015 0 42 8.721569e-02
## 3 Unin_Rate_Change_2010_2015 0 39 -5.433333e-02
## 4 Health_In_Co_Change_2010_2015 0 51 3.840980e+05
## 5 Ep_Health_In_Co_2015 0 51 3.378275e+06
## 6 Mp_Health_In_Co_2016 0 51 2.172810e+05
## 7 Mp_Tax_Credits_2016 0 51 1.841100e+05
## 8 Avg_Monthly_Tax_Credit_2016 0 47 2.921569e+02
## 9 State_Medic_Exp_2016 0 2 6.274510e-01
## 10 Medicd_Enrol_2013 2 50 1.150867e+06
## 11 Medicd_Enrol_2016 0 51 1.441822e+06
## 12 Medicd_Enrol_2013_2016 2 50 3.286971e+05
## 13 Medicr_Enrol_2016 0 51 1.095961e+06
## Min Max SD
## 1 0.044 0.237 4.201300e-02
## 2 0.028 0.171 3.172527e-02
## 3 -0.103 -0.016 2.133229e-02
## 4 15000.000 3826000.000 6.060467e+05
## 5 335000.000 19552000.000 3.657440e+06
## 6 13313.000 1531714.000 3.135795e+05
## 7 1224.000 1428712.000 2.812288e+05
## 8 178.000 750.000 8.600055e+01
## 9 0.000 1.000 4.882944e-01
## 10 67518.000 7755381.000 1.464344e+06
## 11 63583.000 11843081.000 1.935859e+06
## 12 -3935.000 4087700.000 5.909484e+05
## 13 88966.000 5829777.000 1.147094e+06
The statistics show that the uninsurance rate across the states has declined over the five years from 2010. More than half of the states expanded their state medicares in 2016. The number of Medicaid enrollments has increased, on average, between 2013 and 2016.
# Missing value treatment using mean
df$Medicd_Enrol_2013[is.na(df$Medicd_Enrol_2013)] <- mean(df$Medicd_Enrol_2013, na.rm = TRUE)
df$Medicd_Enrol_2013_2016[is.na(df$Medicd_Enrol_2013_2016)] <- mean(df$Medicd_Enrol_2013_2016, na.rm = TRUE)
# Check missing value again
sum(is.na(df))
## [1] 0
# Outlier detection using Z-Score
# Function
detect_outliers <- function(column) {
z_scores <- scale(column)
abs_z_scores <- abs(z_scores)
outliers <- abs_z_scores > 3
return(outliers)
}
# Apply the function to all numeric columns
numeric_columns <- sapply(df, is.numeric)
outliers_df <- df[, numeric_columns]
outliers_indices <- logical(0)
for (col in names(outliers_df)) {
outliers_indices <- outliers_indices | detect_outliers(outliers_df[[col]])
}
# Check if there are any outliers
if (any(outliers_indices)) {
outliers_data <- df[outliers_indices, ]
print(outliers_data)
} else {
cat("No outliers detected in any numeric column.")
}
## No outliers detected in any numeric column.
# Load a shapefile of US states
states_shapefile <- st_read("/Users/guhanxi/Desktop/Health Insurance Project/tl_2023_us_state")
## Reading layer `tl_2023_us_state' from data source
## `/Users/guhanxi/Desktop/Health Insurance Project/tl_2023_us_state'
## using driver `ESRI Shapefile'
## Simple feature collection with 56 features and 15 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -179.2311 ymin: -14.60181 xmax: 179.8597 ymax: 71.43979
## Geodetic CRS: NAD83
# Merge the shapefile
merged_data <- merge(states_shapefile, df, by.x = "NAME", by.y = "State")
# Plot the interactive heatmap
tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(merged_data) +
tm_borders() +
tm_fill("Unin_Rate_Change_2010_2015", palette = "RdYlGn", title = "Uninsured Rate Change 2010-2015")